Install packages if required
if (!requireNamespace("BiocManager", quietly = TRUE)){
install.packages("BiocManager")
}
if (!requireNamespace("GEOmetadb", quietly = TRUE)){
BiocManager::install("GEOmetadb")}
if (!requireNamespace("knitr", quietly = TRUE)){
install.packages("knitr")}
if (!requireNamespace("edgeR", quietly = TRUE)){
install.packages("edgeR")}
if (!requireNamespace("readxl", quietly = TRUE)){
BiocManager::install("readxl")}
if (!requireNamespace("RColorBrewer", quietly = TRUE)){
install.packages("RColorBrewer")}
if (!requireNamespace("biomaRt", quietly = TRUE)){
install.packages("biomaRt")}
[1] [2] [3] [4] [5] [6] [7] [8]
Load Packages
library("GEOmetadb")
library("knitr")
library("edgeR")
library("readxl")
library("RColorBrewer")
library("biomaRt")
gse <- getGEO("GSE179448",GSEMatrix=FALSE)
kable(data.frame(head(Meta(gse))), format = "html")
| contact_address | contact_city | contact_country | contact_department | contact_email | contact_institute |
|---|---|---|---|---|---|
| 77 Avenue Louis Pasteur | Boston | USA | Microbiology and Immunobiology | cbdm@hms.harvard.edu | Harvard Medical School |
current_gpl <- names(GPLList(gse))[1]
current_gpl_info <- Meta(getGEO(current_gpl))
[9] [10]
Platform title: Illumina NextSeq 500 (Homo
sapiens)
Submission data:Apr 15 2014
Last Update data: Mar 26 2019
Organism: Homo sapiens
Number of GEO
datasets using this technology: 9931
Number of
GEO samples that use this technology:291658
sfiles = getGEOSuppFiles('GSE179448')
fnames = rownames(sfiles)
fnames
## [1] "/home/rstudio/projects/GSE179448/GSE179448_Gene_count_table.tsv.gz"
## [2] "/home/rstudio/projects/GSE179448/GSE179448_Metadata_GalvanLeon_COVID19_Treg_ULIRNAseq.xlsx"
There are two supplementary files, the first one is a gene
count table file, the second is the metadata of RNA seq(This will be
used later to define groups). Here, I chose the first one with the count
data.
# Get count data if we we have not already downloaded.
file_dowloaded <- "texp.rds"
if (file.exists(file_dowloaded)) {
texp <- readRDS(file_dowloaded)
} else {
texp = read.delim(fnames[1],header=TRUE, check.names = FALSE)
saveRDS(texp, file_dowloaded)
}
kable(texp[1:15,1:10], format = "html")
| gene_symbol | RR0438670.020620.TR#1 | RR0947025.020520.TR#1 | RR0611936.010220.TR#1 | AC3034431.110619.TR#1 | AC3032552.092619.TR#1 | HH0000002.072820.TR#1 | HH0000003.082520.TR#1 | RR0277045.050520.TR#1 | RR0866687.050720.TR#1 |
|---|---|---|---|---|---|---|---|---|---|
| 7SK | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 |
| A1BG | 26 | 10 | 7 | 4 | 6 | 1 | 4 | 7 | 19 |
| A1BG-AS1 | 8 | 4 | 17 | 7 | 11 | 5 | 11 | 5 | 35 |
| A1CF | 1 | 10 | 2 | 4 | 4 | 3 | 0 | 0 | 2 |
| A2M | 65 | 155 | 47 | 42 | 94 | 29 | 45 | 116 | 23 |
| A2M-AS1 | 45 | 56 | 15 | 26 | 43 | 11 | 5 | 46 | 9 |
| A2ML1 | 2 | 3 | 3 | 8 | 5 | 5 | 4 | 2 | 3 |
| A2ML1-AS1 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 |
| A2ML1-AS2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| A2MP1 | 9 | 17 | 6 | 11 | 3 | 3 | 3 | 13 | 7 |
| A3GALT2 | 3 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 |
| A4GALT | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| A4GNT | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 2 | 0 |
| AA06 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| AAAS | 202 | 123 | 233 | 85 | 156 | 176 | 218 | 218 | 255 |
The first column of the data is the gene symbols, and the
other columns are the sampleIDs(The details about sampleIDs will be
explained later).
dim(texp)
## [1] 56120 87
The result shows that there are 56120 genes in this table which is not realistic,so we definitely need to clean this data.
colnames(texp)
## [1] "gene_symbol" "RR0438670.020620.TR#1" "RR0947025.020520.TR#1"
## [4] "RR0611936.010220.TR#1" "AC3034431.110619.TR#1" "AC3032552.092619.TR#1"
## [7] "HH0000002.072820.TR#1" "HH0000003.082520.TR#1" "RR0277045.050520.TR#1"
## [10] "RR0866687.050720.TR#1" "RR0919381.050520.TR#1" "RR0270554.040720.TR#1"
## [13] "RR0650386.033120.TR#1" "RR0921314.060320.TR#1" "RR0415843.060920.TR#1"
## [16] "RR0915093.033120.TR#1" "RR0183111.051520.TR#1" "RR0318907.051220.TR#1"
## [19] "RR0511931.040820.TR#1" "RR0783978.051820.TR#1" "RR0951931.042820.TR#1"
## [22] "RR0236312.051220.TR#1" "RR0248668.050520.TR#1" "RR0467460.050520.TR#1"
## [25] "RR0602232.050120.TR#1" "RR0615993.052220.TR#1" "RR0424088.041520.TR#1"
## [28] "RR0469109.051920.TR#1" "RR0157115.051220.TR#1" "RR0907841.051820.TR#1"
## [31] "RR0997256.051220.TR#1" "RR0417079.041520.TR#1" "RR0579992.041720.TR#1"
## [34] "RR0685229.041620.TR#1" "RR0823855.041320.TR#1" "RR0830593.041320.TR#1"
## [37] "RR0857858.042720.TR#1" "RR0941176.041720.TR#1" "RR0234740.072320.TR#1"
## [40] "RR0243818.071420.TR#1" "RR0370807.042120.TR#1" "RR0931294.060520.TR#1"
## [43] "RR0941834.072320.TR#1" "RR0306970.042120.TR#1" "RR0509389.051920.TR#1"
## [46] "RR0542663.051820.TR#1" "RR0438670.020620.TC#1" "RR0865315.021220.TC#1"
## [49] "AC3034430.110519.TC#1" "RR0947025.020520.TC#1" "RR0890519.012220.TC#1"
## [52] "RR0611936.010220.TC#1" "HH0000002.072820.TC#1" "HH0000003.082520.TC#1"
## [55] "RR0213942.051420.TC#1" "RR0524129.050720.TC#1" "RR0849884.051320.TC#1"
## [58] "RR0866687.050720.TC#1" "RR0915093.033120.TC#1" "RR0183111.051520.TC#1"
## [61] "RR0318907.051220.TC#1" "RR0511931.040820.TC#1" "RR0736447.041020.TC#1"
## [64] "RR0248668.050520.TC#1" "RR0378574.050720.TC#1" "RR0467460.050520.TC#1"
## [67] "RR0602232.050120.TC#1" "RR0615993.052220.TC#1" "RR0424088.041520.TC#1"
## [70] "RR0469109.051920.TC#1" "RR0656234.050720.TC#1" "RR0614457.051920.TC#1"
## [73] "RR0157115.051220.TC#1" "RR0907841.051820.TC#1" "RR0997256.051220.TC#1"
## [76] "RR0823855.041320.TC#1" "RR0830593.041320.TC#1" "RR0857858.042720.TC#1"
## [79] "RR0941176.041720.TC#1" "RR0234740.072320.TC#1" "RR0243818.071420.TC#1"
## [82] "RR0370807.042120.TC#1" "RR0931294.060520.TC#1" "RR0941834.072320.TC#1"
## [85] "RR0306970.042120.TC#1" "RR0509389.051920.TC#1" "RR0542663.051820.TC#1"
The column names is formatted in xxx.yyy.zzz format(for example: RR0947025.020520.TC#1). From reading the second supplementary file, xxx represents the donor ID, yyy represents the processing batch, and zzz represents the cell type. TR represents T regulatory cells that downregulate the immune response, whereas Tconv represents the conventional T cells that will differentiate into effector cells during immune response.
# To extract all sample names from column names
samples <- data.frame(lapply(colnames(texp)[2:87],
FUN=function(x){unlist(strsplit(x,
split = "\\."))}))
#format the texp_filtered with new row names for better representations in plot
#The new row names are only missing the processing batch.
samplenames <- paste0(samples[1,], samples[3,])
colnames(texp) <- c("gene_symbol", samplenames)
# Read the second supplementary file from the GSE and get donor type information
donor_dowloaded <- "tmeta.rds"
if (file.exists(donor_dowloaded)) {
tmeta <- readRDS(donor_dowloaded)
} else {
tmeta = read_xlsx(fnames[2], col_names = TRUE, skip = 1)
saveRDS(tmeta, donor_dowloaded)
}
# Assign donor_type value
donor_type <- lapply(samples[1,],
FUN=function(x){
tmeta$Severity[which(tmeta$`Donor ID` == x)]})
samples[nrow(samples)+1,] <- donor_type
# Assign group(donor type + cell type)
all_type <- paste0(samples[4,], samples[3,])
samples[nrow(samples)+1,] <- all_type
# Assign rownames and colnames
colnames(samples) <- colnames(texp)[2:87]
rownames(samples) <- c("patients","batch","cell_type", "donor_type", "all_type")
samples <- data.frame(t(samples))
kable(samples[1:10,1:5], format = "html")
| patients | batch | cell_type | donor_type | all_type | |
|---|---|---|---|---|---|
| RR0438670TR#1 | RR0438670 | 020620 | TR#1 | Healthy | HealthyTR#1 |
| RR0947025TR#1 | RR0947025 | 020520 | TR#1 | Healthy | HealthyTR#1 |
| RR0611936TR#1 | RR0611936 | 010220 | TR#1 | Healthy | HealthyTR#1 |
| AC3034431TR#1 | AC3034431 | 110619 | TR#1 | Healthy | HealthyTR#1 |
| AC3032552TR#1 | AC3032552 | 092619 | TR#1 | Healthy | HealthyTR#1 |
| HH0000002TR#1 | HH0000002 | 072820 | TR#1 | Healthy | HealthyTR#1 |
| HH0000003TR#1 | HH0000003 | 082520 | TR#1 | Healthy | HealthyTR#1 |
| RR0277045TR#1 | RR0277045 | 050520 | TR#1 | hospitalized moderate to severe | hospitalized moderate to severeTR#1 |
| RR0866687TR#1 | RR0866687 | 050720 | TR#1 | hospitalized moderate to severe | hospitalized moderate to severeTR#1 |
| RR0919381TR#1 | RR0919381 | 050520 | TR#1 | hospitalized moderate to severe | hospitalized moderate to severeTR#1 |
This table shows the sample groups. We can see there are four
different types of donors: healthy, recovered, mild outpatient, and
hospitalized moderate to severe. There are two different T cells being
sequneced: Treg(TR) and Tconv(TC).
Question1: What are
the control and test conditions of the dataset?
There
are two different ways we can look at the dataset. First, we can compare
the expression of Treg/Tcov across the donor groups, for exmaple,
expression of Treg in healthy people versus hospitalized patients. In
this case, the control is going to be the healthy donors, and the test
conditions are the other three groups: Recovered, Mild Outpatient, and
Hospitalized moderate to severe. Second, we can compare the expression
of Treg to Tconv in one specific donor group, for example, compare the
expression of Treg to Tconv in only the hospitalized patients. In this
case, the control is the Tconv expression, and the tested condition is
the Tconv expression level.
Question2: Why us the dataset
of interest to you?
This dataset is related to COVID
immune respnose. I have heard about results saying how immune response
and T cells activity is different during the cytokine storm in patients,
but I have never dealt with datasets explaining this. This dataset gave
me a handson activity to look at how they come to the conclusion and
what we learn from the RNASeq data from different T cells.
summarized_gene_counts_transcripts <- sort(table(texp$gene_symbol),
decreasing = TRUE)
kable(table(texp$gene_symbol)[1:5], format="html")
| Var1 | Freq |
|---|---|
| 7SK | 1 |
| A1BG | 1 |
| A1BG-AS1 | 1 |
| A1CF | 1 |
| A2M | 1 |
The table here shows no duplicated gene symbols are found(I have a hypothesis why this happened, and will be explained in Question 3 in “Identifier Mapping” section).
I chose to use EdgeR here because I wanted to explore the differences in gene differential expression analysis using two different method. Previous method comparason has shown that DeepSeq package seems to be better as it can identify more genes. However, as mentioned in class, if the normalization methods are so similar, what leads to this difference? Am I still going to be able to identify the differentially expressed genes using a different package?
#translate out counts into counts per million using the edgeR package function cpm
cpms = cpm(texp[,2:87])
rownames(cpms) <- texp[,1]
#filter out low counts
keep10 = rowSums(cpms >1) >= 10
keep43 = rowSums(cpms >1) >= 43
texp_filtered10 = texp[keep10,]
texp_filtered = texp[keep43,]
#check dataset dimension again
dim(texp)
## [1] 56120 87
dim(texp_filtered)
## [1] 16654 87
Questio3: How many outliers are removed?
From filter out low counts genes, we have removed about two
thrid(39466 outliers) of the genes in the dataset to remove outliers.
This step was optimized multiple times. My original filtering chose
keep = rowSums(cpms >1) >= 10 which is too relaxed
considering the number of samples I have(43 for each cell types).
Therefore, Dr. Isserlin suggested a more stringent filtering, and I
decided to use 43 as the filtering threshold. This means that the gene
has to have a cpm greater than 1 and exists in at least 43 samples. This
helped with cleaning up the data to give it a better
distribution(comparison in next section).
data2plot10 <- log2(cpm(texp_filtered10[,2:87]))
boxplot(data2plot10, xlab = "", ylab = "log2 CPM",
las = 2, cex = 0.5, cex.lab = 0.8,
cex.axis = 0.5, main = "Treg/Tconv RNASeq Samples before Optimization"
)
title(xlab = "Samples", line = 4, cex.lab = 0.8)
#draw the median on each box plot
abline(h = median(apply(data2plot10, 2, median)),
col = "green", lwd = 0.8, lty = "dashed")
If we compare the box plot before and after normalization, the median of the log2CPM value increased suggesting more low counts have been filtered out.
We can also compare the density plot before and after optimization.
counts_density <- apply(log2(cpm(texp_filtered[,2:87])),
2, density)
#calculate the limits across all the samples
xlim <- 0; ylim <- 0
for (i in 1:length(counts_density)) {
xlim <- range(c(xlim, counts_density[[i]]$x));
ylim <- range(c(ylim, counts_density[[i]]$y))
}
cols <- rainbow(length(counts_density))
ltys <- rep(1, length(counts_density))
#plot the first density plot to initialize the plot
plot(counts_density[[1]], xlim=xlim, ylim=ylim, type="n",
ylab="Smoothing density of log2-CPM",
main="Density Plot of Treg/Tconv RNASeq Samples Before Optimization", cex.lab = 0.75)
#plot each line
for (i in 1:length(counts_density))
lines(counts_density[[i]], col=cols[i], lty=ltys[i])
#create legend
legend("topright", colnames(data2plot),
col=cols, lty=ltys, cex=0.35,
border ="blue", text.col = "green4",
merge = TRUE, bg = "gray90",
ncol = 3)
As you can see in the before vs after optimization, the first spike became smaller which indicates the dataset is more clean after teh optimization. However, the data is not perfect here as it still has the first spike meaning the distribution of data does not centered.
plotMA(log2(texp_filtered[,c(2,21)]), ylab="M - ratio log expression",
main="Treg Healthy vs Hospitalized Patient")
This is an example of an MA plot showing the expression of Treg in a Healthy person vs a hospitalized patient with moderate to severe symptoms. Each dot represent a gene. We can see some gene have a pretty big fold change in expression level as the dots shown in the upper and lower left of the plot. The average log-expression of genes are mostly around 5, and a big percentage of genes are concentrated in the centre. We will then normalize the data, and try to remove the more dispeased points in the plot.
#Change the texp_filtered into a matrix
filtered_data_matrix <- as.matrix(texp_filtered[,2:87])
rownames(filtered_data_matrix) <- texp_filtered$gene_symbol
d = DGEList(counts=filtered_data_matrix, group=samples$all_type)
d = calcNormFactors(d)
normalized_counts <- cpm(d)
data2plot <- log2(normalized_counts)
boxplot(data2plot, xlab = "", ylab = "log2 CPM",
las = 2, cex = 0.5, cex.lab = 0.8,
cex.axis = 0.5, main = "Treg/Tconv RNASeq Samples after Normalization"
)
title(xlab = "Samples", line = 4, cex.lab = 0.8)
#draw the median on each box plot
abline(h = median(apply(data2plot, 2, median)),
col = "green", lwd = 0.8, lty = "dashed")
If you compare the boxplot before and after normalization, we can see that the log2CPM and median of log2CPM does not really show any changes. However, if you look at the green dashline, the median of each sample/replicates became more aligned. ### Density Plot
counts_density <- apply(log2(normalized_counts),
2, density)
#calculate the limits across all the samples
xlim <- 0; ylim <- 0
for (i in 1:length(counts_density)) {
xlim <- range(c(xlim, counts_density[[i]]$x));
ylim <- range(c(ylim, counts_density[[i]]$y))
}
cols <- rainbow(length(counts_density))
ltys <- rep(1, length(counts_density))
#plot the first density plot to initialize the plot
plot(counts_density[[1]], xlim=xlim, ylim=ylim, type="n",
ylab="Smoothing density of log2-CPM",
main="Density Plot of Treg/Tconv RNASeq Samples after Normalization", cex.lab = 0.75)
#plot each line
for (i in 1:length(counts_density))
lines(counts_density[[i]], col=cols[i], lty=ltys[i])
#create legend
legend("topright", colnames(data2plot),
col=cols, lty=ltys, cex=0.35,
border ="blue", text.col = "green4",
merge = TRUE, bg = "gray90",
ncol = 3)
If you compare the density plot before and after normalization, you can see the N and bandwidth of course does not change. The lines representing each sample became more compact which means they align better while the distribution of data remains the same.
Question4: How did you handle replicates?
There are 86 samples in this dataset. Replicates are handled by
removing lower counts from the dataset and normalization of the data.
Removing the outliers helps remove the noises and non-informative
outputs. Normalization of data helps to reduve biological and technical
variances.
## MDS plot
par(mar = c(8, 4, 2, 2), xpd = TRUE)
plotMDS(d, col = brewer.pal(8,"Paired")[factor(samples$all_type)], pch = 19,
main = "MDS plot for Treg/Tconv RNASeq Samples")
#create legend
legend("bottom", legend = levels(d$samples$group),
c=brewer.pal(8,"Paired"), cex=0.7, inset = c(0, -0.57),
pch = 19, ncol = 2)
From the MDS plot, we can see the distances between different groups. Here, each group have a main color, and each cell type have a light or darker color. For exmaple, Tconv cells RNASeq in healthy donors are represented in light blue and Treg cells RNASeq are represented in darker blue. This is easier for visualization and pairwise comparison. As you can see, all TC samples cluster on the right half of the graph and all TR samples cluster on the left half of the cell. If you take a closer look at each donor groups, you can see that data points in one group tends to be in closer vicinity than the data times in two groups. However, the mild outpatients samples seem to be right in the middle of other samples. # Dispersion
model_design <- model.matrix(~samples$patients
+ samples$cell_type+0)
d_cell <- estimateDisp(d, model_design)
plotBCV(d_cell,col.tagwise = "black",col.common = "red",
main= "BCV plot for Treg/Tconv RNASeq Samples")
Each dot represents the biological coefficient variation. Genes with low counts will have higher variance as shown by the trend line. The variation seen here can come from either biological or technical source in a RNASeq experiment. # Mean-variance relationship plot
plotMeanVar(d_cell, show.raw.vars = TRUE,
show.tagwise.vars=TRUE,
NBline=TRUE, show.ave.raw.vars = TRUE,
show.binned.common.disp.vars = TRUE,
main = "Mean-variance Plot for Treg/Tconv RNASeq Samples")
The mean-variance plot creates a visual representation of mean-variance relationship. The grey dots are the raw vars, the blue represents the tagwise/genewise vaes, the darker red line(avg of raw vars) is covered by the red line(the binned common dispersion vars), and lastly the NBline is shown in blue. # Identifier mapping
ensembl <- useMart("ensembl", version = "Ensembl Genes 109")
ensembl_93 <- useMart("ensembl",
host = "https://jul2018.archive.ensembl.org",
version = "Ensembl Genes 93")
ensembl = useDataset("hsapiens_gene_ensembl",mart=ensembl)
ensembl_93 = useDataset("hsapiens_gene_ensembl",mart=ensembl_93)
biomart_human_filters <- listFilters(ensembl)
kable(biomart_human_filters[
grep(biomart_human_filters$name,pattern="hgnc"),],
format="html")
| name | description | |
|---|---|---|
| 22 | with_hgnc | With HGNC Symbol ID(s) |
| 46 | with_hgnc_trans_name | With Transcript name ID(s) |
| 77 | hgnc_id | HGNC ID(s) [e.g. HGNC:100] |
| 78 | hgnc_symbol | HGNC symbol(s) [e.g. A1BG] |
| 105 | hgnc_trans_name | Transcript name ID(s) [e.g. A1BG-201] |
Because out datasets used HGNC gene symbols(names) as the identifier, we will select this(78 hgnc_symbol) as the filter.
conversion_stash_cur <- "gene_conversion_cur.rds"
if (file.exists(conversion_stash_cur)) {
gene_conversion_cur <- readRDS(conversion_stash_cur)
} else {
gene_conversion_cur <- biomaRt::getBM(attributes =c("hgnc_symbol",
"hgnc_symbol"),
filters = c("hgnc_symbol"),
values = texp_filtered$gene_symbol,
mart= ensembl)
saveRDS(gene_conversion_cur, conversion_stash_cur)
}
colnames(gene_conversion_cur) <- c("Merge", "Hugo_Symbol")
normalized_counts_annot <-merge(gene_conversion_cur, normalized_counts,
by.x = "Merge",
by.y = "row.names", all.y = TRUE)
normalized_counts_annot[which(is.na(normalized_counts_annot$Hugo_Symbol)), ]
Question5: Were there expression values that were not
unique for specific genes?
As the table shown above,
using the most current version of ensembl, there are 2560 expression
cannot be mapped to the current version of genes. Because this dataset
is published in July 2021, but the data preparation started way brefore
this, and I think they might have used older versions of ensembl mart.
Therefore, I ran the following code and identified that 400 more
expressions can be mapped from a older version.
conversion_stash_old <- "gene_conversion_old.rds"
if (file.exists(conversion_stash_old)) {
gene_conversion_old <- readRDS(conversion_stash_old)
} else {
gene_conversion_old <- biomaRt::getBM(attributes =c("hgnc_symbol",
"hgnc_symbol"),
filters = c("hgnc_symbol"),
values = texp_filtered$gene_symbol,
mart= ensembl_93)
saveRDS(gene_conversion_old, conversion_stash_old)
}
length(which(rownames(normalized_counts) %in% gene_conversion_cur$hgnc_symbol))
## [1] 0
length(which(rownames(normalized_counts) %in% gene_conversion_old$hgnc_symbol))
## [1] 14494
colnames(gene_conversion_old) <- c("Merge", "Hugo_Symbol")
normalized_counts_annot_old <-merge(gene_conversion_old, normalized_counts,
by.x = "Merge",
by.y = "row.names", all.y = TRUE)
normalized_counts_annot_old[which(is.na(normalized_counts_annot_old$Hugo_Symbol)), ]
As you can see from the table, the first two genes are mapped to
previous HGNC symbols.
Question6: Were there expression
values that were not unique for specific genes? How did you handle
this?
I think the authors already tried to clean the
data and the data I am using now is a result of they already tried to
mapped all identifiers to HGNC symbols.Therefore, no duplicated genes
are found in the dataset.
all_outputs <- c(gene_conversion_cur[,1],
gene_conversion_old[,1])
new_genes <- data.frame(unique(all_outputs), unique(all_outputs))
colnames(new_genes) <- c("Merge", "Hugo_Symbol")
normalized_counts_annot_all <-merge(new_genes, normalized_counts,
by.x = "Merge",
by.y = "row.names", all.y = TRUE)
expr_not_mapped <- normalized_counts_annot_all[which(is.na(normalized_counts_annot_all$Hugo_Symbol)), ]
nrow(expr_not_mapped)
## [1] 2154
There are a total of 2154 expression data not mapped.
cleaned_expr <- normalized_counts_annot_all[which(!is.na(normalized_counts_annot$Hugo_Symbol)), ]
new_rownames <- cleaned_expr[,1]
cleaned_expr <- cleaned_expr[,c(-1,-2)]
rownames(cleaned_expr) <- new_rownames
kable(cleaned_expr[1:20, 1:10], format("html"))
| RR0438670TR#1 | RR0947025TR#1 | RR0611936TR#1 | AC3034431TR#1 | AC3032552TR#1 | HH0000002TR#1 | HH0000003TR#1 | RR0277045TR#1 | RR0866687TR#1 | RR0919381TR#1 | |
|---|---|---|---|---|---|---|---|---|---|---|
| A1BG | 8.1753000 | 3.828280 | 1.7599736 | 2.228690 | 2.346909 | 0.4244309 | 1.379698 | 2.2220154 | 5.6833355 | 2.180196 |
| A1BG-AS1 | 2.5154769 | 1.531312 | 4.2742217 | 3.900208 | 4.302666 | 2.1221546 | 3.794170 | 1.5871538 | 10.4693022 | 1.453464 |
| A2M | 20.4382501 | 59.338344 | 11.8169659 | 23.401247 | 36.768237 | 12.3084966 | 15.521606 | 36.8219693 | 6.8798271 | 3.633660 |
| A2M-AS1 | 14.1495578 | 21.438370 | 3.7713721 | 14.486486 | 16.819513 | 4.6687401 | 1.724623 | 14.6018154 | 2.6921063 | 1.090098 |
| A2ML1 | 0.6288692 | 1.148484 | 0.7542744 | 4.457380 | 1.955757 | 2.1221546 | 1.379698 | 0.6348615 | 0.8973688 | 1.090098 |
| A2MP1 | 2.8299116 | 6.508076 | 1.5085488 | 6.128898 | 1.173454 | 1.2732927 | 1.034774 | 4.1266000 | 2.0938604 | 2.543562 |
| AAAS | 63.5157927 | 47.087848 | 58.5819798 | 47.359666 | 61.019627 | 74.6998412 | 75.193559 | 69.1999078 | 76.2763444 | 54.868262 |
| AACS | 13.8351232 | 13.781809 | 21.3711085 | 15.043659 | 19.948724 | 16.5528057 | 17.246229 | 10.7926462 | 15.2552689 | 11.991077 |
| AAGAB | 79.5519582 | 49.384816 | 63.3590511 | 49.588356 | 67.278051 | 75.9731339 | 63.466123 | 67.6127539 | 92.4289821 | 79.940513 |
| AAK1 | 342.1048635 | 298.988690 | 368.3373409 | 220.083154 | 391.933761 | 301.7703810 | 323.194334 | 339.0160618 | 184.5588412 | 224.923536 |
| AAMDC | 5.0309539 | 11.867669 | 9.0512930 | 9.471933 | 14.472604 | 11.0352038 | 10.347737 | 8.5706308 | 7.7771959 | 9.084149 |
| AAMP | 99.0469044 | 93.027211 | 99.0613736 | 84.690226 | 78.621443 | 140.9110640 | 113.825112 | 131.4163386 | 138.4939117 | 100.289008 |
| AAR2 | 36.7888502 | 55.892892 | 64.3647504 | 25.072764 | 31.292117 | 66.2112228 | 67.605218 | 75.5485231 | 72.0886236 | 93.385054 |
| AARS2 | 38.3610233 | 33.688867 | 22.8796574 | 12.814969 | 32.465571 | 42.4430916 | 31.043212 | 18.4109846 | 38.8859795 | 18.895031 |
| AARSD1 | 4.4020846 | 4.593936 | 8.0455938 | 3.343035 | 3.911515 | 6.3664637 | 6.898492 | 11.7449385 | 9.2728105 | 2.906928 |
| AASDH | 21.0671194 | 19.907058 | 24.3882062 | 27.301455 | 28.945208 | 19.9482530 | 21.040400 | 17.4586923 | 18.5456210 | 18.531665 |
| AASDHPPT | 27.6702463 | 36.751491 | 37.4622961 | 56.274427 | 38.332843 | 38.1987824 | 50.358989 | 25.0770308 | 52.0473880 | 47.964308 |
| AASS | 11.0052116 | 25.266650 | 12.5712403 | 18.386694 | 1.955757 | 16.5528057 | 30.698288 | 3.1743077 | 8.0763188 | 1.453464 |
| AATBC | 8.1753000 | 8.422217 | 4.2742217 | 3.343035 | 6.649575 | 3.3954473 | 5.863718 | 12.0623692 | 0.5982458 | 2.906928 |
| AATF | 118.2274161 | 99.152459 | 138.5350681 | 89.704779 | 108.740105 | 124.7826892 | 142.798777 | 137.4475232 | 131.3149616 | 178.776057 |
As shown in this table, we have produced a cleaned dataset with HGNC
symbols are row names.
Question7: What is the final
coverage of your dataset?
We have 14500 genes left
excluding the ones are not being mapped 2154 expression. They make up
the 16654 genes after removing low counts.